Thera bank recently saw a steep decline in the number of credit card users. Credit cards are a good source of income for banks because they can charge a variety of fees to the users. Some fees are charged to every user irrespective of usage, while others are charged under specified circumstances.
The loss of credit card clients is affecting the bank's bottom line, so the bank wants to analyze the data of customers and identify the clients who will end their credit card services, so the bank can attempt to retain them as credit card clients.
As a Data scientist for Thera bank, I need to derive a classification model that will help the bank improve its services so that customers do not renounce their credit cards.
# import the libraries
import numpy as np
import pandas as pd
# Libraries to help with data visualization
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
# libraries to build models
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier
# To tune model, get different metric scores and split data
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.metrics import (
f1_score,
accuracy_score,
recall_score,
precision_score,
confusion_matrix,
roc_auc_score,
plot_confusion_matrix,
)
# To oversample and undersample data
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
# To suppress the warnings
import warnings
warnings.filterwarnings("ignore")
# Load the raw Thera Bank churn dataset and work on a deep copy so the
# original frame stays available untouched for reference.
churn = pd.read_csv('BankChurners.csv')
df = churn.copy(deep = True)
df.shape
(10127, 21)
# First look at the data: preview the first 20 rows.
df.head(20)
| CLIENTNUM | Attrition_Flag | Customer_Age | Gender | Dependent_count | Education_Level | Marital_Status | Income_Category | Card_Category | Months_on_book | ... | Months_Inactive_12_mon | Contacts_Count_12_mon | Credit_Limit | Total_Revolving_Bal | Avg_Open_To_Buy | Total_Amt_Chng_Q4_Q1 | Total_Trans_Amt | Total_Trans_Ct | Total_Ct_Chng_Q4_Q1 | Avg_Utilization_Ratio | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 768805383 | Existing Customer | 45 | M | 3 | High School | Married | $60K - $80K | Blue | 39 | ... | 1 | 3 | 12691.0 | 777 | 11914.0 | 1.335 | 1144 | 42 | 1.625 | 0.061 |
| 1 | 818770008 | Existing Customer | 49 | F | 5 | Graduate | Single | Less than $40K | Blue | 44 | ... | 1 | 2 | 8256.0 | 864 | 7392.0 | 1.541 | 1291 | 33 | 3.714 | 0.105 |
| 2 | 713982108 | Existing Customer | 51 | M | 3 | Graduate | Married | $80K - $120K | Blue | 36 | ... | 1 | 0 | 3418.0 | 0 | 3418.0 | 2.594 | 1887 | 20 | 2.333 | 0.000 |
| 3 | 769911858 | Existing Customer | 40 | F | 4 | High School | NaN | Less than $40K | Blue | 34 | ... | 4 | 1 | 3313.0 | 2517 | 796.0 | 1.405 | 1171 | 20 | 2.333 | 0.760 |
| 4 | 709106358 | Existing Customer | 40 | M | 3 | Uneducated | Married | $60K - $80K | Blue | 21 | ... | 1 | 0 | 4716.0 | 0 | 4716.0 | 2.175 | 816 | 28 | 2.500 | 0.000 |
| 5 | 713061558 | Existing Customer | 44 | M | 2 | Graduate | Married | $40K - $60K | Blue | 36 | ... | 1 | 2 | 4010.0 | 1247 | 2763.0 | 1.376 | 1088 | 24 | 0.846 | 0.311 |
| 6 | 810347208 | Existing Customer | 51 | M | 4 | NaN | Married | $120K + | Gold | 46 | ... | 1 | 3 | 34516.0 | 2264 | 32252.0 | 1.975 | 1330 | 31 | 0.722 | 0.066 |
| 7 | 818906208 | Existing Customer | 32 | M | 0 | High School | NaN | $60K - $80K | Silver | 27 | ... | 2 | 2 | 29081.0 | 1396 | 27685.0 | 2.204 | 1538 | 36 | 0.714 | 0.048 |
| 8 | 710930508 | Existing Customer | 37 | M | 3 | Uneducated | Single | $60K - $80K | Blue | 36 | ... | 2 | 0 | 22352.0 | 2517 | 19835.0 | 3.355 | 1350 | 24 | 1.182 | 0.113 |
| 9 | 719661558 | Existing Customer | 48 | M | 2 | Graduate | Single | $80K - $120K | Blue | 36 | ... | 3 | 3 | 11656.0 | 1677 | 9979.0 | 1.524 | 1441 | 32 | 0.882 | 0.144 |
| 10 | 708790833 | Existing Customer | 42 | M | 5 | Uneducated | NaN | $120K + | Blue | 31 | ... | 3 | 2 | 6748.0 | 1467 | 5281.0 | 0.831 | 1201 | 42 | 0.680 | 0.217 |
| 11 | 710821833 | Existing Customer | 65 | M | 1 | NaN | Married | $40K - $60K | Blue | 54 | ... | 2 | 3 | 9095.0 | 1587 | 7508.0 | 1.433 | 1314 | 26 | 1.364 | 0.174 |
| 12 | 710599683 | Existing Customer | 56 | M | 1 | College | Single | $80K - $120K | Blue | 36 | ... | 6 | 0 | 11751.0 | 0 | 11751.0 | 3.397 | 1539 | 17 | 3.250 | 0.000 |
| 13 | 816082233 | Existing Customer | 35 | M | 3 | Graduate | NaN | $60K - $80K | Blue | 30 | ... | 1 | 3 | 8547.0 | 1666 | 6881.0 | 1.163 | 1311 | 33 | 2.000 | 0.195 |
| 14 | 712396908 | Existing Customer | 57 | F | 2 | Graduate | Married | Less than $40K | Blue | 48 | ... | 2 | 2 | 2436.0 | 680 | 1756.0 | 1.190 | 1570 | 29 | 0.611 | 0.279 |
| 15 | 714885258 | Existing Customer | 44 | M | 4 | NaN | NaN | $80K - $120K | Blue | 37 | ... | 1 | 2 | 4234.0 | 972 | 3262.0 | 1.707 | 1348 | 27 | 1.700 | 0.230 |
| 16 | 709967358 | Existing Customer | 48 | M | 4 | Post-Graduate | Single | $80K - $120K | Blue | 36 | ... | 2 | 3 | 30367.0 | 2362 | 28005.0 | 1.708 | 1671 | 27 | 0.929 | 0.078 |
| 17 | 753327333 | Existing Customer | 41 | M | 3 | NaN | Married | $80K - $120K | Blue | 34 | ... | 4 | 1 | 13535.0 | 1291 | 12244.0 | 0.653 | 1028 | 21 | 1.625 | 0.095 |
| 18 | 806160108 | Existing Customer | 61 | M | 1 | High School | Married | $40K - $60K | Blue | 56 | ... | 2 | 3 | 3193.0 | 2517 | 676.0 | 1.831 | 1336 | 30 | 1.143 | 0.788 |
| 19 | 709327383 | Existing Customer | 45 | F | 2 | Graduate | Married | abc | Blue | 37 | ... | 1 | 2 | 14470.0 | 1157 | 13313.0 | 0.966 | 1207 | 21 | 0.909 | 0.080 |
20 rows × 21 columns
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 10127 entries, 0 to 10126 Data columns (total 21 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 CLIENTNUM 10127 non-null int64 1 Attrition_Flag 10127 non-null object 2 Customer_Age 10127 non-null int64 3 Gender 10127 non-null object 4 Dependent_count 10127 non-null int64 5 Education_Level 8608 non-null object 6 Marital_Status 9378 non-null object 7 Income_Category 10127 non-null object 8 Card_Category 10127 non-null object 9 Months_on_book 10127 non-null int64 10 Total_Relationship_Count 10127 non-null int64 11 Months_Inactive_12_mon 10127 non-null int64 12 Contacts_Count_12_mon 10127 non-null int64 13 Credit_Limit 10127 non-null float64 14 Total_Revolving_Bal 10127 non-null int64 15 Avg_Open_To_Buy 10127 non-null float64 16 Total_Amt_Chng_Q4_Q1 10127 non-null float64 17 Total_Trans_Amt 10127 non-null int64 18 Total_Trans_Ct 10127 non-null int64 19 Total_Ct_Chng_Q4_Q1 10127 non-null float64 20 Avg_Utilization_Ratio 10127 non-null float64 dtypes: float64(5), int64(10), object(6) memory usage: 1.6+ MB
# Changing objects to categorical, getting value counts for only categorical variables
cat_cols = ['Attrition_Flag','Gender','Education_Level','Marital_Status','Income_Category','Card_Category']
for col in cat_cols:
if df[col].dtype == 'object':
df[col] = pd.Categorical(df[col])
for col in cat_cols:
vc_col = df[col].value_counts()
print(vc_col)
Existing Customer 8500 Attrited Customer 1627 Name: Attrition_Flag, dtype: int64 F 5358 M 4769 Name: Gender, dtype: int64 Graduate 3128 High School 2013 Uneducated 1487 College 1013 Post-Graduate 516 Doctorate 451 Name: Education_Level, dtype: int64 Married 4687 Single 3943 Divorced 748 Name: Marital_Status, dtype: int64 Less than $40K 3561 $40K - $60K 1790 $80K - $120K 1535 $60K - $80K 1402 abc 1112 $120K + 727 Name: Income_Category, dtype: int64 Blue 9436 Silver 555 Gold 116 Platinum 20 Name: Card_Category, dtype: int64
# Statistical analysis: five-number summary, mean and std of numeric columns
df.describe()
| CLIENTNUM | Customer_Age | Dependent_count | Months_on_book | Total_Relationship_Count | Months_Inactive_12_mon | Contacts_Count_12_mon | Credit_Limit | Total_Revolving_Bal | Avg_Open_To_Buy | Total_Amt_Chng_Q4_Q1 | Total_Trans_Amt | Total_Trans_Ct | Total_Ct_Chng_Q4_Q1 | Avg_Utilization_Ratio | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 1.012700e+04 | 10127.000000 | 10127.000000 | 10127.000000 | 10127.000000 | 10127.000000 | 10127.000000 | 10127.000000 | 10127.000000 | 10127.000000 | 10127.000000 | 10127.000000 | 10127.000000 | 10127.000000 | 10127.000000 |
| mean | 7.391776e+08 | 46.325960 | 2.346203 | 35.928409 | 3.812580 | 2.341167 | 2.455317 | 8631.953698 | 1162.814061 | 7469.139637 | 0.759941 | 4404.086304 | 64.858695 | 0.712222 | 0.274894 |
| std | 3.690378e+07 | 8.016814 | 1.298908 | 7.986416 | 1.554408 | 1.010622 | 1.106225 | 9088.776650 | 814.987335 | 9090.685324 | 0.219207 | 3397.129254 | 23.472570 | 0.238086 | 0.275691 |
| min | 7.080821e+08 | 26.000000 | 0.000000 | 13.000000 | 1.000000 | 0.000000 | 0.000000 | 1438.300000 | 0.000000 | 3.000000 | 0.000000 | 510.000000 | 10.000000 | 0.000000 | 0.000000 |
| 25% | 7.130368e+08 | 41.000000 | 1.000000 | 31.000000 | 3.000000 | 2.000000 | 2.000000 | 2555.000000 | 359.000000 | 1324.500000 | 0.631000 | 2155.500000 | 45.000000 | 0.582000 | 0.023000 |
| 50% | 7.179264e+08 | 46.000000 | 2.000000 | 36.000000 | 4.000000 | 2.000000 | 2.000000 | 4549.000000 | 1276.000000 | 3474.000000 | 0.736000 | 3899.000000 | 67.000000 | 0.702000 | 0.176000 |
| 75% | 7.731435e+08 | 52.000000 | 3.000000 | 40.000000 | 5.000000 | 3.000000 | 3.000000 | 11067.500000 | 1784.000000 | 9859.000000 | 0.859000 | 4741.000000 | 81.000000 | 0.818000 | 0.503000 |
| max | 8.283431e+08 | 73.000000 | 5.000000 | 56.000000 | 6.000000 | 6.000000 | 6.000000 | 34516.000000 | 2517.000000 | 34516.000000 | 3.397000 | 18484.000000 | 139.000000 | 3.714000 | 0.999000 |
# CLIENTNUM is only a customer identifier, so it carries no predictive signal.
# Drop it, then confirm both the drop and the earlier object -> category casts.
df = df.drop(columns=["CLIENTNUM"])
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 10127 entries, 0 to 10126 Data columns (total 20 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Attrition_Flag 10127 non-null category 1 Customer_Age 10127 non-null int64 2 Gender 10127 non-null category 3 Dependent_count 10127 non-null int64 4 Education_Level 8608 non-null category 5 Marital_Status 9378 non-null category 6 Income_Category 10127 non-null category 7 Card_Category 10127 non-null category 8 Months_on_book 10127 non-null int64 9 Total_Relationship_Count 10127 non-null int64 10 Months_Inactive_12_mon 10127 non-null int64 11 Contacts_Count_12_mon 10127 non-null int64 12 Credit_Limit 10127 non-null float64 13 Total_Revolving_Bal 10127 non-null int64 14 Avg_Open_To_Buy 10127 non-null float64 15 Total_Amt_Chng_Q4_Q1 10127 non-null float64 16 Total_Trans_Amt 10127 non-null int64 17 Total_Trans_Ct 10127 non-null int64 18 Total_Ct_Chng_Q4_Q1 10127 non-null float64 19 Avg_Utilization_Ratio 10127 non-null float64 dtypes: category(6), float64(5), int64(9) memory usage: 1.1 MB
# Statistical summary of the categorical columns
# (count, number of levels, most frequent level and its frequency)
df.describe(exclude=np.number).T
| count | unique | top | freq | |
|---|---|---|---|---|
| Attrition_Flag | 10127 | 2 | Existing Customer | 8500 |
| Gender | 10127 | 2 | F | 5358 |
| Education_Level | 8608 | 6 | Graduate | 3128 |
| Marital_Status | 9378 | 3 | Married | 4687 |
| Income_Category | 10127 | 6 | Less than $40K | 3561 |
| Card_Category | 10127 | 4 | Blue | 9436 |
# Function to create a combo boxplot/histogram with same x-axis.
def histogram_boxplot(data, feature, figsize=(12, 7), kde=False, bins=None):
    """
    Boxplot and histogram of one numeric column combined on a shared x-axis.

    data: dataframe
    feature: dataframe column name (numeric)
    figsize: size of figure (default (12, 7))
    kde: whether to show the density curve (default False)
    bins: number of bins for histogram (default None -> seaborn's "auto")
    """
    f2, (ax_box2, ax_hist2) = plt.subplots(
        nrows=2,
        sharex=True,  # x-axis will be shared so both panels line up
        gridspec_kw={"height_ratios": (0.25, 0.75)},
        figsize=figsize,
    )  # creating 2 subplots
    # Boxplot on top; showmeans=True marks the mean value with a star.
    sns.boxplot(
        data=data, x=feature, ax=ax_box2, showmeans=True, color="violet"
    )
    # Histogram below. The original selected between two near-identical calls
    # with a conditional expression and passed palette="winter", which seaborn
    # ignores (with a warning) when no hue is given; a single call with an
    # explicit bins fallback is equivalent.
    sns.histplot(
        data=data,
        x=feature,
        kde=kde,
        ax=ax_hist2,
        bins=bins if bins is not None else "auto",
    )
    ax_hist2.axvline(
        data[feature].mean(), color="green", linestyle="--"
    )  # Green dashed line on the mean for the histogram
    ax_hist2.axvline(
        data[feature].median(), color="black", linestyle="-"
    )  # Black solid line on the median for the histogram
# Univariate views of the numeric features.
histogram_boxplot(df, "Customer_Age")
histogram_boxplot(df, "Dependent_count")
histogram_boxplot(df, "Months_on_book")
histogram_boxplot(df, "Total_Relationship_Count")
histogram_boxplot(df, "Months_Inactive_12_mon")
histogram_boxplot(df, "Contacts_Count_12_mon")
histogram_boxplot(df, "Credit_Limit")
# Distribution is not normal. I will try taking the log of this variable to see if it normalizes
df['Credit_Limit_Log'] = np.log(df['Credit_Limit'])
histogram_boxplot(df, 'Credit_Limit_Log')
histogram_boxplot(df, "Total_Revolving_Bal")
histogram_boxplot(df, "Avg_Open_To_Buy")
# Not normal, will try a log transformation
df['Avg_Open_To_Buy_Log'] = np.log(df['Avg_Open_To_Buy'])
histogram_boxplot(df, 'Avg_Open_To_Buy_Log')
# That doesn't look much better. Trying with the square root
df['Avg_Open_To_Buy_Sqrt'] = np.sqrt(df['Avg_Open_To_Buy'])
histogram_boxplot(df, 'Avg_Open_To_Buy_Sqrt')
histogram_boxplot(df, "Total_Amt_Chng_Q4_Q1")
histogram_boxplot(df, "Total_Trans_Amt")
histogram_boxplot(df, "Total_Trans_Ct")
histogram_boxplot(df, "Total_Ct_Chng_Q4_Q1")
histogram_boxplot(df, "Avg_Utilization_Ratio")
# not normally distributed. I will try taking the log of the variable.
# Since there are 0's, and you can't take the log of non-positive numbers, a small
# constant is added before taking the log. NOTE(review): the code adds 0.01,
# not 0.0001 as the original comment claimed — comment corrected to match code.
df['Avg_Utilization_Ratio_Log'] = np.log(df['Avg_Utilization_Ratio'] + 0.01)
histogram_boxplot(df, 'Avg_Utilization_Ratio_Log')
# Still looks strange. I will try taking the square root:
df['Avg_Utilization_Ratio_Sqrt'] = np.sqrt(df['Avg_Utilization_Ratio'])
histogram_boxplot(df, 'Avg_Utilization_Ratio_Sqrt')
# function to create labeled barplots
def labeled_barplot(data, feature, perc=False, n=None):
    """
    Barplot of a categorical feature with the count (or percentage) of each
    level displayed above its bar.

    data: dataframe
    feature: dataframe column
    perc: whether to display percentages instead of count (default is False)
    n: displays the top n category levels (default is None, i.e., display all levels)
    """
    total = len(data[feature])  # number of rows (denominator for percentages)
    count = data[feature].nunique()
    # Scale the figure width with the number of bars shown
    # (merges the original duplicated if/else figure-sizing branches).
    plt.figure(figsize=((count if n is None else n) + 1, 5))
    plt.xticks(rotation=90, fontsize=15)
    ax = sns.countplot(
        data=data,
        x=feature,
        palette="Paired",
        order=data[feature].value_counts().index[:n].sort_values(),
    )
    for p in ax.patches:
        if perc:  # was `perc == True`; a direct truth test is the Python idiom
            label = "{:.1f}%".format(
                100 * p.get_height() / total
            )  # percentage of each class of the category
        else:
            label = p.get_height()  # count of each level of the category
        x = p.get_x() + p.get_width() / 2  # horizontal center of the bar
        y = p.get_height()  # top of the bar
        ax.annotate(
            label,
            (x, y),
            ha="center",
            va="center",
            size=12,
            xytext=(0, 5),
            textcoords="offset points",
        )  # place the label just above the bar
    plt.show()  # show the plot
# Labeled barplots for the discrete numeric and categorical variables.
labeled_barplot(df, "Dependent_count")
labeled_barplot(df, "Total_Relationship_Count")
labeled_barplot(df, "Months_Inactive_12_mon")
labeled_barplot(df, "Contacts_Count_12_mon")
labeled_barplot(df, "Attrition_Flag")
labeled_barplot(df, "Gender")
labeled_barplot(df, "Education_Level")
labeled_barplot(df, "Marital_Status")
labeled_barplot(df, "Income_Category")
labeled_barplot(df, "Card_Category")
# dropping variables that had a log or sqrt transformation, as well as variables created by me
# with a log transformation that was not successful
# Also, converting "Attrition Flag" to numeric, with "Existing Customer"
# encoded as "0", and "Attrited Customer" encoded as "1"
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 10127 entries, 0 to 10126 Data columns (total 25 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Attrition_Flag 10127 non-null category 1 Customer_Age 10127 non-null int64 2 Gender 10127 non-null category 3 Dependent_count 10127 non-null int64 4 Education_Level 8608 non-null category 5 Marital_Status 9378 non-null category 6 Income_Category 10127 non-null category 7 Card_Category 10127 non-null category 8 Months_on_book 10127 non-null int64 9 Total_Relationship_Count 10127 non-null int64 10 Months_Inactive_12_mon 10127 non-null int64 11 Contacts_Count_12_mon 10127 non-null int64 12 Credit_Limit 10127 non-null float64 13 Total_Revolving_Bal 10127 non-null int64 14 Avg_Open_To_Buy 10127 non-null float64 15 Total_Amt_Chng_Q4_Q1 10127 non-null float64 16 Total_Trans_Amt 10127 non-null int64 17 Total_Trans_Ct 10127 non-null int64 18 Total_Ct_Chng_Q4_Q1 10127 non-null float64 19 Avg_Utilization_Ratio 10127 non-null float64 20 Credit_Limit_Log 10127 non-null float64 21 Avg_Open_To_Buy_Log 10127 non-null float64 22 Avg_Open_To_Buy_Sqrt 10127 non-null float64 23 Avg_Utilization_Ratio_Log 10127 non-null float64 24 Avg_Utilization_Ratio_Sqrt 10127 non-null float64 dtypes: category(6), float64(10), int64(9) memory usage: 1.5 MB
# Drop each raw column that was superseded by its log/sqrt transform,
# keeping only the successful transformation of each variable.
df = df.drop(
    columns=[
        "Avg_Utilization_Ratio",
        "Avg_Utilization_Ratio_Log",
        "Avg_Open_To_Buy",
        "Avg_Open_To_Buy_Log",
        "Credit_Limit",
    ]
)
# Encode the target numerically: "Existing Customer" -> 0, "Attrited Customer" -> 1.
# (The original used chained `df["col"].replace(..., inplace=True)`, which relies
# on chained assignment — a deprecated pandas pattern; a single map is equivalent.)
df["Attrition_Flag"] = df["Attrition_Flag"].map(
    {"Existing Customer": 0, "Attrited Customer": 1}
).astype("int64")
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 10127 entries, 0 to 10126 Data columns (total 20 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Attrition_Flag 10127 non-null int64 1 Customer_Age 10127 non-null int64 2 Gender 10127 non-null category 3 Dependent_count 10127 non-null int64 4 Education_Level 8608 non-null category 5 Marital_Status 9378 non-null category 6 Income_Category 10127 non-null category 7 Card_Category 10127 non-null category 8 Months_on_book 10127 non-null int64 9 Total_Relationship_Count 10127 non-null int64 10 Months_Inactive_12_mon 10127 non-null int64 11 Contacts_Count_12_mon 10127 non-null int64 12 Total_Revolving_Bal 10127 non-null int64 13 Total_Amt_Chng_Q4_Q1 10127 non-null float64 14 Total_Trans_Amt 10127 non-null int64 15 Total_Trans_Ct 10127 non-null int64 16 Total_Ct_Chng_Q4_Q1 10127 non-null float64 17 Credit_Limit_Log 10127 non-null float64 18 Avg_Open_To_Buy_Sqrt 10127 non-null float64 19 Avg_Utilization_Ratio_Sqrt 10127 non-null float64 dtypes: category(5), float64(5), int64(10) memory usage: 1.2 MB
# Pairwise relationships between numeric features (KDE curves on the diagonal).
sns.pairplot(data=df, corner = True, diag_kind="kde")
<seaborn.axisgrid.PairGrid at 0x22512b46640>
# Correlation heatmap of the numeric columns, annotated to 2 decimals.
plt.figure(figsize=(15, 7))
sns.heatmap(df.corr(), annot=True, vmin=-1, vmax=1, fmt=".2f", cmap="Spectral")
plt.show()
# function to plot stacked bar chart
def stacked_barplot(data, predictor, target):
    """
    Print the predictor-vs-target crosstab and plot a normalized stacked bar chart.

    data: dataframe
    predictor: independent variable
    target: target variable
    """
    count = data[predictor].nunique()
    # Sort rows by the least frequent target class (the last index of
    # value_counts, i.e. the attrited class in this dataset).
    sorter = data[target].value_counts().index[-1]
    tab1 = pd.crosstab(data[predictor], data[target], margins=True).sort_values(
        by=sorter, ascending=False
    )
    print(tab1)
    print("-" * 120)
    # Row-normalized crosstab so each bar shows class proportions.
    tab = pd.crosstab(data[predictor], data[target], normalize="index").sort_values(
        by=sorter, ascending=False
    )
    tab.plot(kind="bar", stacked=True, figsize=(count + 1, 5))
    # The original called plt.legend twice; the first call ("lower left",
    # frameon=False) was dead code, immediately replaced by this one.
    plt.legend(loc="upper left", bbox_to_anchor=(1, 1))
    plt.show()
stacked_barplot(df, "Gender", "Attrition_Flag")
Attrition_Flag 0 1 All Gender All 8500 1627 10127 F 4428 930 5358 M 4072 697 4769 ------------------------------------------------------------------------------------------------------------------------
stacked_barplot(df, "Education_Level", "Attrition_Flag")
Attrition_Flag 0 1 All Education_Level All 7237 1371 8608 Graduate 2641 487 3128 High School 1707 306 2013 Uneducated 1250 237 1487 College 859 154 1013 Doctorate 356 95 451 Post-Graduate 424 92 516 ------------------------------------------------------------------------------------------------------------------------
stacked_barplot(df, "Marital_Status", "Attrition_Flag")
Attrition_Flag 0 1 All Marital_Status All 7880 1498 9378 Married 3978 709 4687 Single 3275 668 3943 Divorced 627 121 748 ------------------------------------------------------------------------------------------------------------------------
stacked_barplot(df, "Income_Category", "Attrition_Flag")
Attrition_Flag 0 1 All Income_Category All 8500 1627 10127 Less than $40K 2949 612 3561 $40K - $60K 1519 271 1790 $80K - $120K 1293 242 1535 $60K - $80K 1213 189 1402 abc 925 187 1112 $120K + 601 126 727 ------------------------------------------------------------------------------------------------------------------------
stacked_barplot(df, "Card_Category", "Attrition_Flag")
Attrition_Flag 0 1 All Card_Category All 8500 1627 10127 Blue 7917 1519 9436 Silver 473 82 555 Gold 95 21 116 Platinum 15 5 20 ------------------------------------------------------------------------------------------------------------------------
Recall, which gives the ratio of true positives to actual positives, is the most important metric. I will calculate accuracy, precision, and F1 as well, but my goal will be to maximize recall.
X = df.drop(["Attrition_Flag"], axis=1)
y = df["Attrition_Flag"]
# Splitting data into Train, Validation and Test sets
# First, I will split into temporary and test, with 20% in test
# (stratify=y keeps the churn/no-churn ratio identical in every split)
X_temp, X_test, y_temp, y_test = train_test_split(
    X, y, test_size=0.2, random_state=1, stratify=y
)
# Then, I will split temp set into train and validation
# (0.25 of the remaining 80% -> a 60/20/20 train/val/test split overall)
X_train, X_val, y_train, y_val = train_test_split(
    X_temp, y_temp, test_size=0.25, random_state=1, stratify=y_temp
)
# finding the size of each dataset
print(X_train.shape, X_val.shape, X_test.shape)
(6075, 19) (2026, 19) (2026, 19)
# To prevent data leakage, I will replace missing values in each set separately.
# Education_Level/Marital_Status NaNs are filled with the overall mode of each
# column ("Graduate" / "Married"); the garbage income level "abc" is merged
# into the existing "Less than $40K" category.
# BUG FIX: the original replaced "abc" with "Less than 40K" (missing "$"),
# which silently created a NEW category alongside the existing
# "Less than $40K" — visible in the value_counts output, where both levels
# appeared. Using the exact existing label merges them instead.
for frame in (X_train, X_val, X_test):
    frame['Education_Level'].fillna("Graduate", inplace = True)
    frame['Marital_Status'].fillna("Married", inplace = True)
    frame['Income_Category'].replace("abc", "Less than $40K", inplace = True)
# confirming that all missing data is filled
X_train.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 6075 entries, 800 to 4035 Data columns (total 19 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Customer_Age 6075 non-null int64 1 Gender 6075 non-null category 2 Dependent_count 6075 non-null int64 3 Education_Level 6075 non-null category 4 Marital_Status 6075 non-null category 5 Income_Category 6075 non-null category 6 Card_Category 6075 non-null category 7 Months_on_book 6075 non-null int64 8 Total_Relationship_Count 6075 non-null int64 9 Months_Inactive_12_mon 6075 non-null int64 10 Contacts_Count_12_mon 6075 non-null int64 11 Total_Revolving_Bal 6075 non-null int64 12 Total_Amt_Chng_Q4_Q1 6075 non-null float64 13 Total_Trans_Amt 6075 non-null int64 14 Total_Trans_Ct 6075 non-null int64 15 Total_Ct_Chng_Q4_Q1 6075 non-null float64 16 Credit_Limit_Log 6075 non-null float64 17 Avg_Open_To_Buy_Sqrt 6075 non-null float64 18 Avg_Utilization_Ratio_Sqrt 6075 non-null float64 dtypes: category(5), float64(5), int64(9) memory usage: 742.4 KB
X_test.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 2026 entries, 9760 to 413 Data columns (total 19 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Customer_Age 2026 non-null int64 1 Gender 2026 non-null category 2 Dependent_count 2026 non-null int64 3 Education_Level 2026 non-null category 4 Marital_Status 2026 non-null category 5 Income_Category 2026 non-null category 6 Card_Category 2026 non-null category 7 Months_on_book 2026 non-null int64 8 Total_Relationship_Count 2026 non-null int64 9 Months_Inactive_12_mon 2026 non-null int64 10 Contacts_Count_12_mon 2026 non-null int64 11 Total_Revolving_Bal 2026 non-null int64 12 Total_Amt_Chng_Q4_Q1 2026 non-null float64 13 Total_Trans_Amt 2026 non-null int64 14 Total_Trans_Ct 2026 non-null int64 15 Total_Ct_Chng_Q4_Q1 2026 non-null float64 16 Credit_Limit_Log 2026 non-null float64 17 Avg_Open_To_Buy_Sqrt 2026 non-null float64 18 Avg_Utilization_Ratio_Sqrt 2026 non-null float64 dtypes: category(5), float64(5), int64(9) memory usage: 248.1 KB
X_val.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 2026 entries, 2894 to 6319 Data columns (total 19 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Customer_Age 2026 non-null int64 1 Gender 2026 non-null category 2 Dependent_count 2026 non-null int64 3 Education_Level 2026 non-null category 4 Marital_Status 2026 non-null category 5 Income_Category 2026 non-null category 6 Card_Category 2026 non-null category 7 Months_on_book 2026 non-null int64 8 Total_Relationship_Count 2026 non-null int64 9 Months_Inactive_12_mon 2026 non-null int64 10 Contacts_Count_12_mon 2026 non-null int64 11 Total_Revolving_Bal 2026 non-null int64 12 Total_Amt_Chng_Q4_Q1 2026 non-null float64 13 Total_Trans_Amt 2026 non-null int64 14 Total_Trans_Ct 2026 non-null int64 15 Total_Ct_Chng_Q4_Q1 2026 non-null float64 16 Credit_Limit_Log 2026 non-null float64 17 Avg_Open_To_Buy_Sqrt 2026 non-null float64 18 Avg_Utilization_Ratio_Sqrt 2026 non-null float64 dtypes: category(5), float64(5), int64(9) memory usage: 248.1 KB
# confirming that "abc" is no longer a value in "Income_Category"
X_train.Income_Category.value_counts()
Less than $40K 2129 $40K - $60K 1059 $80K - $120K 953 $60K - $80K 831 Less than 40K 654 $120K + 449 Name: Income_Category, dtype: int64
X_test.Income_Category.value_counts()
Less than $40K 696 $40K - $60K 370 $60K - $80K 292 $80K - $120K 289 Less than 40K 237 $120K + 142 Name: Income_Category, dtype: int64
X_val.Income_Category.value_counts()
Less than $40K 736 $40K - $60K 361 $80K - $120K 293 $60K - $80K 279 Less than 40K 221 $120K + 136 Name: Income_Category, dtype: int64
# Performing one-hot encoding on categorical variables
# NOTE(review): encoding each split separately only produces identical column
# sets because these columns are pandas Categoricals that share the same
# category sets; if a level were ever absent from one split's categories the
# dummy columns would diverge — safer to encode once before splitting, or to
# reindex val/test to the train columns. Confirm against the info() output.
X_train = pd.get_dummies(X_train, drop_first = True)
X_test = pd.get_dummies(X_test, drop_first = True)
X_val = pd.get_dummies(X_val, drop_first = True)
# Confirming that it happened
X_train.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 6075 entries, 800 to 4035 Data columns (total 30 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Customer_Age 6075 non-null int64 1 Dependent_count 6075 non-null int64 2 Months_on_book 6075 non-null int64 3 Total_Relationship_Count 6075 non-null int64 4 Months_Inactive_12_mon 6075 non-null int64 5 Contacts_Count_12_mon 6075 non-null int64 6 Total_Revolving_Bal 6075 non-null int64 7 Total_Amt_Chng_Q4_Q1 6075 non-null float64 8 Total_Trans_Amt 6075 non-null int64 9 Total_Trans_Ct 6075 non-null int64 10 Total_Ct_Chng_Q4_Q1 6075 non-null float64 11 Credit_Limit_Log 6075 non-null float64 12 Avg_Open_To_Buy_Sqrt 6075 non-null float64 13 Avg_Utilization_Ratio_Sqrt 6075 non-null float64 14 Gender_M 6075 non-null uint8 15 Education_Level_Doctorate 6075 non-null uint8 16 Education_Level_Graduate 6075 non-null uint8 17 Education_Level_High School 6075 non-null uint8 18 Education_Level_Post-Graduate 6075 non-null uint8 19 Education_Level_Uneducated 6075 non-null uint8 20 Marital_Status_Married 6075 non-null uint8 21 Marital_Status_Single 6075 non-null uint8 22 Income_Category_$40K - $60K 6075 non-null uint8 23 Income_Category_$60K - $80K 6075 non-null uint8 24 Income_Category_$80K - $120K 6075 non-null uint8 25 Income_Category_Less than $40K 6075 non-null uint8 26 Income_Category_Less than 40K 6075 non-null uint8 27 Card_Category_Gold 6075 non-null uint8 28 Card_Category_Platinum 6075 non-null uint8 29 Card_Category_Silver 6075 non-null uint8 dtypes: float64(5), int64(9), uint8(16) memory usage: 806.8 KB
X_test.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 2026 entries, 9760 to 413 Data columns (total 30 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Customer_Age 2026 non-null int64 1 Dependent_count 2026 non-null int64 2 Months_on_book 2026 non-null int64 3 Total_Relationship_Count 2026 non-null int64 4 Months_Inactive_12_mon 2026 non-null int64 5 Contacts_Count_12_mon 2026 non-null int64 6 Total_Revolving_Bal 2026 non-null int64 7 Total_Amt_Chng_Q4_Q1 2026 non-null float64 8 Total_Trans_Amt 2026 non-null int64 9 Total_Trans_Ct 2026 non-null int64 10 Total_Ct_Chng_Q4_Q1 2026 non-null float64 11 Credit_Limit_Log 2026 non-null float64 12 Avg_Open_To_Buy_Sqrt 2026 non-null float64 13 Avg_Utilization_Ratio_Sqrt 2026 non-null float64 14 Gender_M 2026 non-null uint8 15 Education_Level_Doctorate 2026 non-null uint8 16 Education_Level_Graduate 2026 non-null uint8 17 Education_Level_High School 2026 non-null uint8 18 Education_Level_Post-Graduate 2026 non-null uint8 19 Education_Level_Uneducated 2026 non-null uint8 20 Marital_Status_Married 2026 non-null uint8 21 Marital_Status_Single 2026 non-null uint8 22 Income_Category_$40K - $60K 2026 non-null uint8 23 Income_Category_$60K - $80K 2026 non-null uint8 24 Income_Category_$80K - $120K 2026 non-null uint8 25 Income_Category_Less than $40K 2026 non-null uint8 26 Income_Category_Less than 40K 2026 non-null uint8 27 Card_Category_Gold 2026 non-null uint8 28 Card_Category_Platinum 2026 non-null uint8 29 Card_Category_Silver 2026 non-null uint8 dtypes: float64(5), int64(9), uint8(16) memory usage: 269.1 KB
X_val.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 2026 entries, 2894 to 6319 Data columns (total 30 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Customer_Age 2026 non-null int64 1 Dependent_count 2026 non-null int64 2 Months_on_book 2026 non-null int64 3 Total_Relationship_Count 2026 non-null int64 4 Months_Inactive_12_mon 2026 non-null int64 5 Contacts_Count_12_mon 2026 non-null int64 6 Total_Revolving_Bal 2026 non-null int64 7 Total_Amt_Chng_Q4_Q1 2026 non-null float64 8 Total_Trans_Amt 2026 non-null int64 9 Total_Trans_Ct 2026 non-null int64 10 Total_Ct_Chng_Q4_Q1 2026 non-null float64 11 Credit_Limit_Log 2026 non-null float64 12 Avg_Open_To_Buy_Sqrt 2026 non-null float64 13 Avg_Utilization_Ratio_Sqrt 2026 non-null float64 14 Gender_M 2026 non-null uint8 15 Education_Level_Doctorate 2026 non-null uint8 16 Education_Level_Graduate 2026 non-null uint8 17 Education_Level_High School 2026 non-null uint8 18 Education_Level_Post-Graduate 2026 non-null uint8 19 Education_Level_Uneducated 2026 non-null uint8 20 Marital_Status_Married 2026 non-null uint8 21 Marital_Status_Single 2026 non-null uint8 22 Income_Category_$40K - $60K 2026 non-null uint8 23 Income_Category_$60K - $80K 2026 non-null uint8 24 Income_Category_$80K - $120K 2026 non-null uint8 25 Income_Category_Less than $40K 2026 non-null uint8 26 Income_Category_Less than 40K 2026 non-null uint8 27 Card_Category_Gold 2026 non-null uint8 28 Card_Category_Platinum 2026 non-null uint8 29 Card_Category_Silver 2026 non-null uint8 dtypes: float64(5), int64(9), uint8(16) memory usage: 269.1 KB
def confusion_matrix_sklearn(model, predictors, target):
    """
    Plot the model's confusion matrix, annotating each cell with its count
    and its share of all predictions.

    model: classifier
    predictors: independent variables
    target: dependent variable
    """
    predictions = model.predict(predictors)
    cm = confusion_matrix(target, predictions)
    total = cm.sum()
    # One "count\npercent" annotation string per cell, reshaped to 2x2.
    cell_labels = np.asarray(
        ["{0:0.0f}".format(v) + "\n{0:.2%}".format(v / total) for v in cm.flatten()]
    ).reshape(2, 2)
    plt.figure(figsize=(6, 4))
    sns.heatmap(cm, annot=cell_labels, fmt="")
    plt.ylabel("True label")
    plt.xlabel("Predicted label")
# defining a function to compute different metrics to check performance of a classification model built using sklearn
def model_performance_classification_sklearn(model, predictors, target):
    """
    Compute accuracy, recall, precision and F1 for a fitted classifier and
    return them as a single-row dataframe.

    model: classifier
    predictors: independent variables
    target: dependent variable
    """
    # predict once, then derive every metric from the same predictions
    pred = model.predict(predictors)
    return pd.DataFrame(
        {
            "Accuracy": accuracy_score(target, pred),
            "Recall": recall_score(target, pred),
            "Precision": precision_score(target, pred),
            "F1": f1_score(target, pred),
        },
        index=[0],
    )
# Model 1: Logistic Regression
# NOTE(review): this file is a notebook export — bare expression lines such as
# "LogisticRegression(random_state=1)" and the markdown "| Accuracy | ... |"
# tables below are captured cell output, not executable statements.
log_reg = LogisticRegression(random_state=1)
log_reg.fit(X_train, y_train)
LogisticRegression(random_state=1)
# I will evaluate the model performance by using KFold
# Recall is used as the tuning metric: missing a churner (false negative)
# is the costly outcome for the bank.
scoring = "recall"
kfold = StratifiedKFold(
    n_splits=5, shuffle=True, random_state=1
)  # Setting number of splits equal to 5
cv_result_bfr = cross_val_score(
    estimator=log_reg, X=X_train, y=y_train, scoring=scoring, cv=kfold
)
# Plotting boxplots for CV scores of model defined above
plt.boxplot(cv_result_bfr)
plt.show()
# Calculating different metrics on train set
log_reg_model_train_perf = model_performance_classification_sklearn(
    log_reg, X_train, y_train
)
print("Training performance:")
log_reg_model_train_perf
Training performance:
| Accuracy | Recall | Precision | F1 | |
|---|---|---|---|---|
| 0 | 0.871934 | 0.434426 | 0.652308 | 0.521525 |
# Calculating different metrics on validation set
log_reg_model_val_perf = model_performance_classification_sklearn(
    log_reg, X_val, y_val)
print("Validation performance:")
log_reg_model_val_perf
Validation performance:
| Accuracy | Recall | Precision | F1 | |
|---|---|---|---|---|
| 0 | 0.880553 | 0.490798 | 0.677966 | 0.569395 |
# creating confusion matrix
confusion_matrix_sklearn(log_reg, X_val, y_val)
# Over-Sampling with SMOTE
print("Before UpSampling, counts of label 'Yes': {}".format(sum(y_train == 1)))
print("Before UpSampling, counts of label 'No': {} \n".format(sum(y_train == 0)))
# sampling_strategy=1 -> synthesize minority samples until classes are 1:1
sm = SMOTE(
    sampling_strategy=1, k_neighbors=5, random_state=1
)  # Synthetic Minority Over Sampling Technique
X_train_over, y_train_over = sm.fit_resample(X_train, y_train)
print("After UpSampling, counts of label 'Yes': {}".format(sum(y_train_over == 1)))
print("After UpSampling, counts of label 'No': {} \n".format(sum(y_train_over == 0)))
print("After UpSampling, the shape of train_X: {}".format(X_train_over.shape))
print("After UpSampling, the shape of train_y: {} \n".format(y_train_over.shape))
Before UpSampling, counts of label 'Yes': 976 Before UpSampling, counts of label 'No': 5099 After UpSampling, counts of label 'Yes': 5099 After UpSampling, counts of label 'No': 5099 After UpSampling, the shape of train_X: (10198, 30) After UpSampling, the shape of train_y: (10198,)
# counts are now equal.
# Logistic Regression on Over-Sampled Data
log_reg_over = LogisticRegression(random_state=1)
# Training the basic logistic regression model with training set
log_reg_over.fit(X_train_over, y_train_over)
LogisticRegression(random_state=1)
# K-Fold
scoring = "recall"
kfold = StratifiedKFold(
    n_splits=5, shuffle=True, random_state=1
)  # Setting number of splits equal to 5
cv_result_over = cross_val_score(
    estimator=log_reg_over, X=X_train_over, y=y_train_over, scoring=scoring, cv=kfold
)
# Plotting boxplots for CV scores of model defined above
plt.boxplot(cv_result_over)
plt.show()
# Calculating different metrics on train set
log_reg_over_train_perf = model_performance_classification_sklearn(
    log_reg_over, X_train_over, y_train_over
)
print("Training performance:")
log_reg_over_train_perf
Training performance:
| Accuracy | Recall | Precision | F1 | |
|---|---|---|---|---|
| 0 | 0.840753 | 0.849971 | 0.834585 | 0.842208 |
# Calculating different metrics on validation set
log_reg_over_model_val_perf = model_performance_classification_sklearn(
    log_reg_over, X_val, y_val
)
print("validation performance:")
log_reg_over_model_val_perf
validation performance:
| Accuracy | Recall | Precision | F1 | |
|---|---|---|---|---|
| 0 | 0.843534 | 0.843558 | 0.508318 | 0.634371 |
# creating confusion matrix
confusion_matrix_sklearn(log_reg_over, X_val, y_val)
# Under-sampling with Random Under Sampler
# Randomly drops majority-class rows until classes are balanced.
rus = RandomUnderSampler(random_state=1)
X_train_un, y_train_un = rus.fit_resample(X_train, y_train)
print("Before Under Sampling, counts of label 'Yes': {}".format(sum(y_train == 1)))
print("Before Under Sampling, counts of label 'No': {} \n".format(sum(y_train == 0)))
print("After Under Sampling, counts of label 'Yes': {}".format(sum(y_train_un == 1)))
print("After Under Sampling, counts of label 'No': {} \n".format(sum(y_train_un == 0)))
print("After Under Sampling, the shape of train_X: {}".format(X_train_un.shape))
print("After Under Sampling, the shape of train_y: {} \n".format(y_train_un.shape))
Before Under Sampling, counts of label 'Yes': 976 Before Under Sampling, counts of label 'No': 5099 After Under Sampling, counts of label 'Yes': 976 After Under Sampling, counts of label 'No': 976 After Under Sampling, the shape of train_X: (1952, 30) After Under Sampling, the shape of train_y: (1952,)
# counts are now equal
# Logistic Regression on Under-Sampled Data
log_reg_under = LogisticRegression(random_state=1)
log_reg_under.fit(X_train_un, y_train_un)
LogisticRegression(random_state=1)
# K-Fold
scoring = "recall"
kfold = StratifiedKFold(
    n_splits=5, shuffle=True, random_state=1
)  # Setting number of splits equal to 5
cv_result_under = cross_val_score(
    estimator=log_reg_under, X=X_train_un, y=y_train_un, scoring=scoring, cv=kfold
)
# Plotting boxplots for CV scores of model defined above
plt.boxplot(cv_result_under)
plt.show()
# Calculating different metrics on train set
log_reg_under_train_perf = model_performance_classification_sklearn(
    log_reg_under, X_train_un, y_train_un
)
print("Training performance:")
log_reg_under_train_perf
Training performance:
| Accuracy | Recall | Precision | F1 | |
|---|---|---|---|---|
| 0 | 0.822746 | 0.83709 | 0.813745 | 0.825253 |
# Calculating different metrics on validation set
log_reg_under_model_val_perf = model_performance_classification_sklearn(
    log_reg_under, X_val, y_val
)
print("Validation performance:")
log_reg_under_model_val_perf
Validation performance:
| Accuracy | Recall | Precision | F1 | |
|---|---|---|---|---|
| 0 | 0.816387 | 0.846626 | 0.461538 | 0.597403 |
# creating confusion matrix
confusion_matrix_sklearn(log_reg_under, X_val, y_val)
# Model 2: Decision Tree Classifier
# Same evaluation protocol as Model 1: fit, 5-fold CV on recall, then
# train/validation metrics and a confusion matrix, repeated on the
# original, SMOTE-oversampled and undersampled training sets.
dTree = DecisionTreeClassifier(criterion = 'gini', random_state=1)
dTree.fit(X_train, y_train)
DecisionTreeClassifier(random_state=1)
# K-Fold
scoring = "recall"
kfold = StratifiedKFold(
    n_splits=5, shuffle=True, random_state=1
)  # Setting number of splits equal to 5
cv_result_bfr = cross_val_score(
    estimator=dTree, X=X_train, y=y_train, scoring=scoring, cv=kfold
)
# Plotting boxplots for CV scores of model defined above
plt.boxplot(cv_result_bfr)
plt.show()
# Calculating different metrics on train set
dTree_model_train_perf = model_performance_classification_sklearn(
    dTree, X_train, y_train
)
print("Training performance:")
dTree_model_train_perf
Training performance:
| Accuracy | Recall | Precision | F1 | |
|---|---|---|---|---|
| 0 | 1.0 | 1.0 | 1.0 | 1.0 |
# NOTE(review): perfect training scores here (and below) show the
# unpruned tree memorizes the training data — expected overfitting.
# Calculating different metrics on validation set
dTree_model_val_perf = model_performance_classification_sklearn(
    dTree, X_val, y_val
)
print("Validation performance:")
dTree_model_val_perf
Validation performance:
| Accuracy | Recall | Precision | F1 | |
|---|---|---|---|---|
| 0 | 0.936328 | 0.803681 | 0.801223 | 0.80245 |
# creating confusion matrix
confusion_matrix_sklearn(dTree, X_val, y_val)
# Decision Tree on Over-Sampled Data
# Oversampling sets have already been created, applying decision tree to them
dTree_over = DecisionTreeClassifier(random_state=1)
# Training the basic logistic regression model with training set
dTree_over.fit(X_train_over, y_train_over)
DecisionTreeClassifier(random_state=1)
# K-Fold
scoring = "recall"
kfold = StratifiedKFold(
    n_splits=5, shuffle=True, random_state=1
)  # Setting number of splits equal to 5
cv_result_bfr = cross_val_score(
    estimator=dTree_over, X=X_train_over, y=y_train_over, scoring=scoring, cv=kfold
)
# Plotting boxplots for CV scores of model defined above
plt.boxplot(cv_result_bfr)
plt.show()
# Calculating different metrics on train set
dTree_over_model_train_perf = model_performance_classification_sklearn(
    dTree_over, X_train_over, y_train_over
)
print("Training performance:")
dTree_over_model_train_perf
Training performance:
| Accuracy | Recall | Precision | F1 | |
|---|---|---|---|---|
| 0 | 1.0 | 1.0 | 1.0 | 1.0 |
# Calculating different metrics on validation set
dTree_over_model_val_perf = model_performance_classification_sklearn(
    dTree_over, X_val, y_val
)
print("validation performance:")
dTree_over_model_val_perf
validation performance:
| Accuracy | Recall | Precision | F1 | |
|---|---|---|---|---|
| 0 | 0.929911 | 0.849693 | 0.748649 | 0.795977 |
# creating confusion matrix
confusion_matrix_sklearn(dTree_over, X_val, y_val)
# Decision tree on Under-Sampled data
dTree_under = DecisionTreeClassifier(random_state=1)
dTree_under.fit(X_train_un, y_train_un)
DecisionTreeClassifier(random_state=1)
# K-Fold
scoring = "recall"
kfold = StratifiedKFold(
    n_splits=5, shuffle=True, random_state=1
)  # Setting number of splits equal to 5
cv_result_bfr = cross_val_score(
    estimator=dTree_under, X=X_train_un, y=y_train_un, scoring=scoring, cv=kfold
)
# Plotting boxplots for CV scores of model defined above
plt.boxplot(cv_result_bfr)
plt.show()
# Calculating different metrics on train set
dTree_under_model_train_perf = model_performance_classification_sklearn(
    dTree_under, X_train_un, y_train_un
)
print("Training performance:")
dTree_under_model_train_perf
Training performance:
| Accuracy | Recall | Precision | F1 | |
|---|---|---|---|---|
| 0 | 1.0 | 1.0 | 1.0 | 1.0 |
# Calculating different metrics on validation set
dTree_under_model_val_perf = model_performance_classification_sklearn(
    dTree_under, X_val, y_val
)
print("Validation performance:")
dTree_under_model_val_perf
Validation performance:
| Accuracy | Recall | Precision | F1 | |
|---|---|---|---|---|
| 0 | 0.892892 | 0.898773 | 0.614256 | 0.729763 |
# creating confusion matrix
confusion_matrix_sklearn(dTree_under, X_val, y_val)
# Model 3: Bagging Classifier
# Bagging ensemble whose base estimator is logistic regression (not the
# default decision tree); same fit/CV/metrics protocol as above.
bagging_lr=BaggingClassifier(base_estimator=LogisticRegression(random_state=1),random_state=1)
bagging_lr.fit(X_train,y_train)
BaggingClassifier(base_estimator=LogisticRegression(random_state=1),
                  random_state=1)
# K-Fold
scoring = "recall"
kfold = StratifiedKFold(
    n_splits=5, shuffle=True, random_state=1
)  # Setting number of splits equal to 5
cv_result_bfr = cross_val_score(
    estimator=bagging_lr, X=X_train, y=y_train, scoring=scoring, cv=kfold
)
# Plotting boxplots for CV scores of model defined above
plt.boxplot(cv_result_bfr)
plt.show()
# Calculating different metrics on train set
bagging_lr_model_train_perf = model_performance_classification_sklearn(
    bagging_lr, X_train, y_train
)
print("Training performance:")
bagging_lr_model_train_perf
Training performance:
| Accuracy | Recall | Precision | F1 | |
|---|---|---|---|---|
| 0 | 0.88642 | 0.492828 | 0.711538 | 0.582324 |
# Calculating different metrics on validation set
bagging_lr_model_val_perf = model_performance_classification_sklearn(
    bagging_lr, X_val, y_val
)
print("Validation performance:")
bagging_lr_model_val_perf
Validation performance:
| Accuracy | Recall | Precision | F1 | |
|---|---|---|---|---|
| 0 | 0.896841 | 0.546012 | 0.74477 | 0.630088 |
# creating confusion matrix
confusion_matrix_sklearn(bagging_lr, X_val, y_val)
# Bagging Estimator on Over-Sampled Data
bagging_lr_over = BaggingClassifier(base_estimator=LogisticRegression(random_state=1),random_state=1)
# Training the bagging estimator with training set
bagging_lr_over.fit(X_train_over, y_train_over)
BaggingClassifier(base_estimator=LogisticRegression(random_state=1),
                  random_state=1)
# K-Fold
scoring = "recall"
kfold = StratifiedKFold(
    n_splits=5, shuffle=True, random_state=1
)  # Setting number of splits equal to 5
cv_result_bfr = cross_val_score(
    estimator=bagging_lr_over, X=X_train_over, y=y_train_over, scoring=scoring, cv=kfold
)
# Plotting boxplots for CV scores of model defined above
plt.boxplot(cv_result_bfr)
plt.show()
# Calculating different metrics on train set
bagging_lr_over_model_train_perf = model_performance_classification_sklearn(
    bagging_lr_over, X_train_over, y_train_over
)
print("Training performance:")
bagging_lr_over_model_train_perf
Training performance:
| Accuracy | Recall | Precision | F1 | |
|---|---|---|---|---|
| 0 | 0.843008 | 0.858207 | 0.832889 | 0.845359 |
# Calculating different metrics on validation set
bagging_lr_over_model_val_perf = model_performance_classification_sklearn(
    bagging_lr_over, X_val, y_val
)
print("Validation performance:")
bagging_lr_over_model_val_perf
Validation performance:
| Accuracy | Recall | Precision | F1 | |
|---|---|---|---|---|
| 0 | 0.837611 | 0.846626 | 0.497297 | 0.626561 |
# creating confusion matrix
confusion_matrix_sklearn(bagging_lr_over, X_val, y_val)
# Bagging Estimator on Under-Sampled Data
bagging_lr_under = BaggingClassifier(base_estimator=LogisticRegression(random_state=1),random_state=1)
# Training the bagging estimator with training set
bagging_lr_under.fit(X_train_un, y_train_un)
BaggingClassifier(base_estimator=LogisticRegression(random_state=1),
                  random_state=1)
# K-Fold
scoring = "recall"
kfold = StratifiedKFold(
    n_splits=5, shuffle=True, random_state=1
)  # Setting number of splits equal to 5
cv_result_bfr = cross_val_score(
    estimator=bagging_lr_under, X=X_train_un, y=y_train_un, scoring=scoring, cv=kfold
)
# Plotting boxplots for CV scores of model defined above
plt.boxplot(cv_result_bfr)
plt.show()
# Calculating different metrics on train set
bagging_lr_under_model_train_perf = model_performance_classification_sklearn(
    bagging_lr_under, X_train_un, y_train_un
)
print("Training performance:")
bagging_lr_under_model_train_perf
Training performance:
| Accuracy | Recall | Precision | F1 | |
|---|---|---|---|---|
| 0 | 0.824283 | 0.838115 | 0.815553 | 0.82668 |
# Calculating different metrics on validation set
bagging_lr_under_model_val_perf = model_performance_classification_sklearn(
    bagging_lr_under, X_val, y_val
)
print("Validation performance:")
bagging_lr_under_model_val_perf
Validation performance:
| Accuracy | Recall | Precision | F1 | |
|---|---|---|---|---|
| 0 | 0.815893 | 0.837423 | 0.460371 | 0.594124 |
# creating confusion matrix
confusion_matrix_sklearn(bagging_lr_under, X_val, y_val)
# Model 4: Random Forest Estimator
# Default RandomForestClassifier; same fit/CV/metrics protocol as above.
rf_estimator=RandomForestClassifier(random_state=1)
rf_estimator.fit(X_train,y_train)
RandomForestClassifier(random_state=1)
# K-Fold
scoring = "recall"
kfold = StratifiedKFold(
    n_splits=5, shuffle=True, random_state=1
)  # Setting number of splits equal to 5
cv_result_bfr = cross_val_score(
    estimator=rf_estimator, X=X_train, y=y_train, scoring=scoring, cv=kfold
)
# Plotting boxplots for CV scores of model defined above
plt.boxplot(cv_result_bfr)
plt.show()
# Calculating different metrics on train set
rf_estimator_model_train_perf = model_performance_classification_sklearn(
    rf_estimator, X_train, y_train
)
print("Training performance:")
rf_estimator_model_train_perf
Training performance:
| Accuracy | Recall | Precision | F1 | |
|---|---|---|---|---|
| 0 | 1.0 | 1.0 | 1.0 | 1.0 |
# Calculating different metrics on validation set
rf_estimator_model_val_perf = model_performance_classification_sklearn(
    rf_estimator, X_val, y_val
)
print("Validation performance:")
rf_estimator_model_val_perf
Validation performance:
| Accuracy | Recall | Precision | F1 | |
|---|---|---|---|---|
| 0 | 0.956071 | 0.794479 | 0.921708 | 0.853377 |
# creating confusion matrix
confusion_matrix_sklearn(rf_estimator, X_val, y_val)
# Random Forest Estimator on Over-Sampled Data
rf_estimator_over=RandomForestClassifier(random_state=1)
rf_estimator_over.fit(X_train_over,y_train_over)
RandomForestClassifier(random_state=1)
# K-Fold
scoring = "recall"
kfold = StratifiedKFold(
    n_splits=5, shuffle=True, random_state=1
)  # Setting number of splits equal to 5
cv_result_bfr = cross_val_score(
    estimator=rf_estimator_over, X=X_train_over, y=y_train_over, scoring=scoring, cv=kfold
)
# Plotting boxplots for CV scores of model defined above
plt.boxplot(cv_result_bfr)
plt.show()
# Calculating different metrics on train set
rf_estimator_over_model_train_perf = model_performance_classification_sklearn(
    rf_estimator_over, X_train_over, y_train_over
)
print("Training performance:")
rf_estimator_over_model_train_perf
Training performance:
| Accuracy | Recall | Precision | F1 | |
|---|---|---|---|---|
| 0 | 1.0 | 1.0 | 1.0 | 1.0 |
# Calculating different metrics on validation set
rf_estimator_over_model_val_perf = model_performance_classification_sklearn(
    rf_estimator_over, X_val, y_val
)
print("Validation performance:")
rf_estimator_over_model_val_perf
Validation performance:
| Accuracy | Recall | Precision | F1 | |
|---|---|---|---|---|
| 0 | 0.956565 | 0.861963 | 0.867284 | 0.864615 |
# creating confusion matrix
confusion_matrix_sklearn(rf_estimator_over, X_val, y_val)
# Random Forest Estimator on Under-Sampled Data
rf_estimator_under=RandomForestClassifier(random_state=1)
rf_estimator_under.fit(X_train_un,y_train_un)
RandomForestClassifier(random_state=1)
# K-Fold
scoring = "recall"
kfold = StratifiedKFold(
    n_splits=5, shuffle=True, random_state=1
)  # Setting number of splits equal to 5
cv_result_bfr = cross_val_score(
    estimator=rf_estimator_under, X=X_train_un, y=y_train_un, scoring=scoring, cv=kfold
)
# Plotting boxplots for CV scores of model defined above
plt.boxplot(cv_result_bfr)
plt.show()
# Calculating different metrics on train set
rf_estimator_under_model_train_perf = model_performance_classification_sklearn(
    rf_estimator_under, X_train_un, y_train_un
)
print("Training performance:")
rf_estimator_under_model_train_perf
Training performance:
| Accuracy | Recall | Precision | F1 | |
|---|---|---|---|---|
| 0 | 1.0 | 1.0 | 1.0 | 1.0 |
# Calculating different metrics on validation set
rf_estimator_under_model_val_perf = model_performance_classification_sklearn(
    rf_estimator_under, X_val, y_val
)
print("Validation performance:")
rf_estimator_under_model_val_perf
Validation performance:
| Accuracy | Recall | Precision | F1 | |
|---|---|---|---|---|
| 0 | 0.928924 | 0.920245 | 0.717703 | 0.806452 |
# creating confusion matrix
confusion_matrix_sklearn(rf_estimator_under, X_val, y_val)
# Model 5: Gradient Boosting Classifier
# Default GradientBoostingClassifier; same fit/CV/metrics protocol as above.
gbc = GradientBoostingClassifier(random_state=1)
gbc.fit(X_train,y_train)
GradientBoostingClassifier(random_state=1)
# K-Fold
scoring = "recall"
kfold = StratifiedKFold(
    n_splits=5, shuffle=True, random_state=1
)  # Setting number of splits equal to 5
cv_result_bfr = cross_val_score(
    estimator=gbc, X=X_train, y=y_train, scoring=scoring, cv=kfold
)
# Plotting boxplots for CV scores of model defined above
plt.boxplot(cv_result_bfr)
plt.show()
# Calculating different metrics on train set
gbc_model_train_perf = model_performance_classification_sklearn(
    gbc, X_train, y_train
)
print("Training performance:")
gbc_model_train_perf
Training performance:
| Accuracy | Recall | Precision | F1 | |
|---|---|---|---|---|
| 0 | 0.97284 | 0.875 | 0.952062 | 0.911906 |
# Calculating different metrics on validation set
gbc_model_val_perf = model_performance_classification_sklearn(
    gbc, X_val, y_val
)
print("Validation performance:")
gbc_model_val_perf
Validation performance:
| Accuracy | Recall | Precision | F1 | |
|---|---|---|---|---|
| 0 | 0.967423 | 0.855828 | 0.936242 | 0.894231 |
# creating confusion matrix
confusion_matrix_sklearn(gbc, X_val, y_val)
# Gradient Boosting Classifier on Over-Sampled Data
gbc_over=GradientBoostingClassifier(random_state=1)
gbc_over.fit(X_train_over,y_train_over)
GradientBoostingClassifier(random_state=1)
# K-Fold
scoring = "recall"
kfold = StratifiedKFold(
    n_splits=5, shuffle=True, random_state=1
)  # Setting number of splits equal to 5
cv_result_bfr = cross_val_score(
    estimator=gbc_over, X=X_train_over, y=y_train_over, scoring=scoring, cv=kfold
)
# Plotting boxplots for CV scores of model defined above
plt.boxplot(cv_result_bfr)
plt.show()
# Calculating different metrics on train set
gbc_over_model_train_perf = model_performance_classification_sklearn(
    gbc_over, X_train_over, y_train_over
)
print("Training performance:")
gbc_over_model_train_perf
Training performance:
| Accuracy | Recall | Precision | F1 | |
|---|---|---|---|---|
| 0 | 0.979212 | 0.982546 | 0.976037 | 0.979281 |
# Calculating different metrics on validation set
gbc_over_model_val_perf = model_performance_classification_sklearn(
    gbc_over, X_val, y_val
)
print("Validation performance:")
gbc_over_model_val_perf
Validation performance:
| Accuracy | Recall | Precision | F1 | |
|---|---|---|---|---|
| 0 | 0.9615 | 0.898773 | 0.866864 | 0.88253 |
# creating confusion matrix
confusion_matrix_sklearn(gbc_over, X_val, y_val)
# Gradient Boosting Classifier on Under-Sampled Data
gbc_under=GradientBoostingClassifier(random_state=1)
gbc_under.fit(X_train_un,y_train_un)
GradientBoostingClassifier(random_state=1)
# K-Fold
scoring = "recall"
kfold = StratifiedKFold(
    n_splits=5, shuffle=True, random_state=1
)  # Setting number of splits equal to 5
cv_result_bfr = cross_val_score(
    estimator=gbc_under, X=X_train_un, y=y_train_un, scoring=scoring, cv=kfold
)
# Plotting boxplots for CV scores of model defined above
plt.boxplot(cv_result_bfr)
plt.show()
# Calculating different metrics on train set
gbc_under_model_train_perf = model_performance_classification_sklearn(
    gbc_under, X_train_un, y_train_un
)
print("Training performance:")
gbc_under_model_train_perf
Training performance:
| Accuracy | Recall | Precision | F1 | |
|---|---|---|---|---|
| 0 | 0.973873 | 0.979508 | 0.968592 | 0.974019 |
# Calculating different metrics on validation set
gbc_under_model_val_perf = model_performance_classification_sklearn(
    gbc_under, X_val, y_val
)
print("Validation performance:")
gbc_under_model_val_perf
Validation performance:
| Accuracy | Recall | Precision | F1 | |
|---|---|---|---|---|
| 0 | 0.936821 | 0.957055 | 0.732394 | 0.829787 |
# creating confusion matrix
confusion_matrix_sklearn(gbc_under, X_val, y_val)
# Model 6: Extreme Gradient Boosting
# eval_metric='logloss' set explicitly to silence the XGBoost default-metric
# warning; same fit/CV/metrics protocol as the previous models.
xgb = XGBClassifier(random_state=1,eval_metric='logloss')
xgb.fit(X_train,y_train)
XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, eval_metric='logloss',
              gamma=0, gpu_id=-1, importance_type='gain',
              interaction_constraints='', learning_rate=0.300000012,
              max_delta_step=0, max_depth=6, min_child_weight=1, missing=nan,
              monotone_constraints='()', n_estimators=100, n_jobs=4,
              num_parallel_tree=1, random_state=1, reg_alpha=0, reg_lambda=1,
              scale_pos_weight=1, subsample=1, tree_method='exact',
              validate_parameters=1, verbosity=None)
# K-Fold
scoring = "recall"
kfold = StratifiedKFold(
    n_splits=5, shuffle=True, random_state=1
)  # Setting number of splits equal to 5
cv_result_bfr = cross_val_score(
    estimator=xgb, X=X_train, y=y_train, scoring=scoring, cv=kfold
)
# Plotting boxplots for CV scores of model defined above
plt.boxplot(cv_result_bfr)
plt.show()
# Calculating different metrics on train set
xgb_model_train_perf = model_performance_classification_sklearn(
    xgb, X_train, y_train
)
print("Training performance:")
xgb_model_train_perf
Training performance:
| Accuracy | Recall | Precision | F1 | |
|---|---|---|---|---|
| 0 | 1.0 | 1.0 | 1.0 | 1.0 |
# Calculating different metrics on validation set
xgb_model_val_perf = model_performance_classification_sklearn(
    xgb, X_val, y_val
)
print("Validation performance:")
xgb_model_val_perf
Validation performance:
| Accuracy | Recall | Precision | F1 | |
|---|---|---|---|---|
| 0 | 0.968411 | 0.886503 | 0.914557 | 0.900312 |
# creating confusion matrix
confusion_matrix_sklearn(xgb, X_val, y_val)
# Extreme Gradient Boosting on Over-Sampled Data
xgb_over = XGBClassifier(random_state=1,eval_metric='logloss')
xgb_over.fit(X_train_over,y_train_over)
XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, eval_metric='logloss',
              gamma=0, gpu_id=-1, importance_type='gain',
              interaction_constraints='', learning_rate=0.300000012,
              max_delta_step=0, max_depth=6, min_child_weight=1, missing=nan,
              monotone_constraints='()', n_estimators=100, n_jobs=4,
              num_parallel_tree=1, random_state=1, reg_alpha=0, reg_lambda=1,
              scale_pos_weight=1, subsample=1, tree_method='exact',
              validate_parameters=1, verbosity=None)
# K-Fold
scoring = "recall"
kfold = StratifiedKFold(
    n_splits=5, shuffle=True, random_state=1
)  # Setting number of splits equal to 5
cv_result_bfr = cross_val_score(
    estimator=xgb_over, X=X_train_over, y=y_train_over, scoring=scoring, cv=kfold
)
# Plotting boxplots for CV scores of model defined above
plt.boxplot(cv_result_bfr)
plt.show()
# Calculating different metrics on train set
xgb_over_model_train_perf = model_performance_classification_sklearn(
    xgb_over, X_train_over, y_train_over
)
print("Training performance:")
xgb_over_model_train_perf
Training performance:
| Accuracy | Recall | Precision | F1 | |
|---|---|---|---|---|
| 0 | 1.0 | 1.0 | 1.0 | 1.0 |
# Calculating different metrics on validation set
xgb_over_model_val_perf = model_performance_classification_sklearn(
    xgb_over, X_val, y_val
)
print("Validation performance:")
xgb_over_model_val_perf
Validation performance:
| Accuracy | Recall | Precision | F1 | |
|---|---|---|---|---|
| 0 | 0.968904 | 0.91411 | 0.894895 | 0.904401 |
# creating confusion matrix
confusion_matrix_sklearn(xgb_over, X_val, y_val)
# Extreme Gradient Boosting with Under-Sampled Data
xgb_under = XGBClassifier(random_state=1,eval_metric='logloss')
xgb_under.fit(X_train_un,y_train_un)
XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, eval_metric='logloss',
              gamma=0, gpu_id=-1, importance_type='gain',
              interaction_constraints='', learning_rate=0.300000012,
              max_delta_step=0, max_depth=6, min_child_weight=1, missing=nan,
              monotone_constraints='()', n_estimators=100, n_jobs=4,
              num_parallel_tree=1, random_state=1, reg_alpha=0, reg_lambda=1,
              scale_pos_weight=1, subsample=1, tree_method='exact',
              validate_parameters=1, verbosity=None)
# K-Fold
scoring = "recall"
kfold = StratifiedKFold(
    n_splits=5, shuffle=True, random_state=1
)  # Setting number of splits equal to 5
cv_result_bfr = cross_val_score(
    estimator=xgb_under, X=X_train_un, y=y_train_un, scoring=scoring, cv=kfold
)
# Plotting boxplots for CV scores of model defined above
plt.boxplot(cv_result_bfr)
plt.show()
# Calculating different metrics on train set
xgb_under_model_train_perf = model_performance_classification_sklearn(
    xgb_under, X_train_un, y_train_un
)
print("Training performance:")
xgb_under_model_train_perf
Training performance:
| Accuracy | Recall | Precision | F1 | |
|---|---|---|---|---|
| 0 | 1.0 | 1.0 | 1.0 | 1.0 |
# Calculating different metrics on validation set
xgb_under_model_val_perf = model_performance_classification_sklearn(
    xgb_under, X_val, y_val
)
print("Validation performance:")
xgb_under_model_val_perf
Validation performance:
| Accuracy | Recall | Precision | F1 | |
|---|---|---|---|---|
| 0 | 0.940276 | 0.947853 | 0.748184 | 0.836265 |
# creating confusion matrix
confusion_matrix_sklearn(xgb_under, X_val, y_val)
# Validation performance comparison
# Transpose each single-row metric frame so metrics become rows, then
# concatenate side-by-side: one column per model variant.
# NOTE: the column-name list below is positional — it must stay in the
# same order as the frames in pd.concat.
models_val_comp_df = pd.concat(
    [
        log_reg_model_val_perf.T,
        log_reg_over_model_val_perf.T,
        log_reg_under_model_val_perf.T,
        dTree_model_val_perf.T,
        dTree_over_model_val_perf.T,
        dTree_under_model_val_perf.T,
        bagging_lr_model_val_perf.T,
        bagging_lr_over_model_val_perf.T,
        bagging_lr_under_model_val_perf.T,
        rf_estimator_model_val_perf.T,
        rf_estimator_over_model_val_perf.T,
        rf_estimator_under_model_val_perf.T,
        gbc_model_val_perf.T,
        gbc_over_model_val_perf.T,
        gbc_under_model_val_perf.T,
        xgb_model_val_perf.T,
        xgb_over_model_val_perf.T,
        xgb_under_model_val_perf.T
    ],
    axis=1,
)
models_val_comp_df.columns = [
    "Logistic Regression",
    "Logistic Regression with oversampled data",
    "Logistic Regression with undersampled data",
    "Decision Tree",
    "Decision Tree with oversampled data",
    "Decision Tree with undersampled data",
    "Bagging Estimator",
    "Bagging Estimator with oversampled data",
    "Bagging Estimator with undersampled data",
    "Random Forest Estimator",
    "Random Forest Estimator with oversampled data",
    "Random Forest Estimator with undersampled data",
    "Gradient Boosting Classifier",
    "Gradient Boosting Classifier with oversampled data",
    "Gradient Boosting Classifier with undersampled data",
    "Extreme Gradient Boosting",
    "Extreme Gradient Boosting with oversampled data",
    "Extreme Gradient Boosting with undersampled data",
]
print("Validation performance comparison:")
models_val_comp_df
Validation performance comparison:
| Logistic Regression | Logistic Regression with oversampled data | Logistic Regression with undersampled data | Decision Tree | Decision Tree with oversampled data | Decision Tree with undersampled data | Bagging Estimator | Bagging Estimator with oversampled data | Bagging Estimator with undersampled data | Random Forest Estimator | Random Forest Estimator with oversampled data | Random Forest Estimator with undersampled data | Gradient Boosting Classifier | Gradient Boosting Classifier with oversampled data | Gradient Boosting Classifier with undersampled data | Extreme Gradient Boosting | Extreme Gradient Boosting with oversampled data | Extreme Gradient Boosting with undersampled data | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Accuracy | 0.880553 | 0.843534 | 0.816387 | 0.936328 | 0.929911 | 0.892892 | 0.896841 | 0.837611 | 0.815893 | 0.956071 | 0.956565 | 0.928924 | 0.967423 | 0.961500 | 0.936821 | 0.968411 | 0.968904 | 0.940276 |
| Recall | 0.490798 | 0.843558 | 0.846626 | 0.803681 | 0.849693 | 0.898773 | 0.546012 | 0.846626 | 0.837423 | 0.794479 | 0.861963 | 0.920245 | 0.855828 | 0.898773 | 0.957055 | 0.886503 | 0.914110 | 0.947853 |
| Precision | 0.677966 | 0.508318 | 0.461538 | 0.801223 | 0.748649 | 0.614256 | 0.744770 | 0.497297 | 0.460371 | 0.921708 | 0.867284 | 0.717703 | 0.936242 | 0.866864 | 0.732394 | 0.914557 | 0.894895 | 0.748184 |
| F1 | 0.569395 | 0.634371 | 0.597403 | 0.802450 | 0.795977 | 0.729763 | 0.630088 | 0.626561 | 0.594124 | 0.853377 | 0.864615 | 0.806452 | 0.894231 | 0.882530 | 0.829787 | 0.900312 | 0.904401 | 0.836265 |
# I will start by tuning the Extreme Gradient Boosting Classifier
import sklearn.metrics as metrics
from sklearn.model_selection import RandomizedSearchCV

# Base model; the explicit log-loss eval metric silences xgboost's default-metric warning
model1 = XGBClassifier(random_state=1, eval_metric='logloss')

# Search space handed to RandomizedSearchCV
param_grid = {
    'n_estimators': np.arange(50, 150, 50),
    'scale_pos_weight': [2, 5, 10],
    'learning_rate': [0.01, 0.1, 0.2, 0.05],
    'gamma': [0, 1, 3, 5],
    'subsample': [0.8, 0.9, 1],
    'max_depth': np.arange(1, 5, 1),
    'reg_lambda': [5, 10],
}

# Recall is the metric used to rank parameter combinations
scorer = metrics.make_scorer(metrics.recall_score)

# Randomized search: 50 sampled combinations, 5-fold CV, all cores
xgb_under_tuned = RandomizedSearchCV(
    estimator=model1,
    param_distributions=param_grid,
    n_iter=50,
    scoring=scorer,
    cv=5,
    random_state=1,
    n_jobs=-1,
)

# Fit on the under-sampled training data and report the best combination
xgb_under_tuned.fit(X_train_un, y_train_un)
print("Best parameters are {} with CV score={}:".format(xgb_under_tuned.best_params_, xgb_under_tuned.best_score_))
Best parameters are {'subsample': 0.9, 'scale_pos_weight': 10, 'reg_lambda': 5, 'n_estimators': 50, 'max_depth': 1, 'learning_rate': 0.01, 'gamma': 1} with CV score=1.0:
# Score the tuned XGBoost search object on the under-sampled train set
xgboost_random_train_under = model_performance_classification_sklearn(xgb_under_tuned, X_train_un, y_train_un)
print("Training performance:")
xgboost_random_train_under
Training performance:
| Accuracy | Recall | Precision | F1 | |
|---|---|---|---|---|
| 0 | 0.5 | 1.0 | 0.5 | 0.666667 |
# Score the tuned XGBoost search object on the held-out validation set.
# NOTE(review): the printed CV recall of 1.0 together with validation
# precision near the positive base rate suggests the recall-only scorer plus
# scale_pos_weight drove a degenerate predict-everything-positive model —
# worth confirming before trusting this candidate.
xgboost_random_val_under = model_performance_classification_sklearn(xgb_under_tuned, X_val, y_val)
print("Validation performance:")
xgboost_random_val_under
Validation performance:
| Accuracy | Recall | Precision | F1 | |
|---|---|---|---|---|
| 0 | 0.160908 | 1.0 | 0.160908 | 0.277211 |
# Plot the validation-set confusion matrix for the tuned XGBoost model
# (confusion_matrix_sklearn is a helper defined earlier in the notebook)
confusion_matrix_sklearn(xgb_under_tuned, X_val, y_val)
# Next, I will tune the Random Forest Classifier
# defining model
model2 = RandomForestClassifier(random_state=1)

# Grid of parameters to choose from
# add from article
param_grid2 = {
    "n_estimators": [150, 200, 250],
    "min_samples_leaf": np.arange(5, 10),
    "max_features": np.arange(0.2, 0.7, 0.1),
    "max_samples": np.arange(0.3, 0.7, 0.1),
}

# Type of scoring used to compare parameter combinations (recall)
scorer2 = metrics.make_scorer(metrics.recall_score)

# Calling RandomizedSearchCV.
# BUG FIX: this search previously passed `scoring=scorer` (the XGBoost cell's
# scorer) instead of the `scorer2` defined just above. Both happen to score
# recall, so results were unaffected, but the RF search now uses its own
# scorer as intended and no longer depends on the XGBoost cell having run.
rf_estimator_under_tuned = RandomizedSearchCV(
    estimator=model2,
    param_distributions=param_grid2,
    n_iter=50,
    scoring=scorer2,
    cv=5,
    random_state=1,
    n_jobs=-1,
)

# Fitting parameters in RandomizedSearchCV on the under-sampled train data
rf_estimator_under_tuned.fit(X_train_un, y_train_un)
print("Best parameters are {} with CV score={}:".format(rf_estimator_under_tuned.best_params_, rf_estimator_under_tuned.best_score_))
# NOTE: removed a stray `rf_estimator_tuned = RandomForestClassifier(random_state=1)`
# statement that created an untuned model which was never fitted or referenced.
Best parameters are {'n_estimators': 200, 'min_samples_leaf': 6, 'max_samples': 0.6000000000000001, 'max_features': 0.5000000000000001} with CV score=0.9436630036630037:
# Score the tuned random forest on the under-sampled train set
rf_estimator_random_train_under = model_performance_classification_sklearn(rf_estimator_under_tuned, X_train_un, y_train_un)
print("Training performance:")
rf_estimator_random_train_under
Training performance:
| Accuracy | Recall | Precision | F1 | |
|---|---|---|---|---|
| 0 | 0.961578 | 0.976434 | 0.948259 | 0.96214 |
# Score the tuned random forest on the held-out validation set
rf_estimator_random_val_under = model_performance_classification_sklearn(rf_estimator_under_tuned, X_val, y_val)
print("Validation performance:")
rf_estimator_random_val_under
Validation performance:
| Accuracy | Recall | Precision | F1 | |
|---|---|---|---|---|
| 0 | 0.915597 | 0.95092 | 0.666667 | 0.783818 |
# Plot the validation-set confusion matrix for the tuned random forest
confusion_matrix_sklearn(rf_estimator_under_tuned, X_val, y_val)
# Finally, I will tune the Gradient Boosting Classifier
# defining model
model3 = GradientBoostingClassifier(random_state=1)

# Grid of parameters to choose from
# add from article
param_grid3 = {
    "n_estimators": [100, 150, 200, 250],
    "subsample": [0.8, 0.9, 1],
    "max_features": [0.7, 0.8, 0.9, 1],
}

# Recall is the metric used to compare parameter combinations
scorer3 = metrics.make_scorer(metrics.recall_score)

# Randomized search: 50 sampled combinations, 5-fold CV, all cores
gbc_under_tuned = RandomizedSearchCV(
    estimator=model3,
    param_distributions=param_grid3,
    n_iter=50,
    scoring=scorer3,
    cv=5,
    random_state=1,
    n_jobs=-1,
)

# Fit on the under-sampled training data and report the best combination
gbc_under_tuned.fit(X_train_un, y_train_un)
print("Best parameters are {} with CV score={}:".format(gbc_under_tuned.best_params_, gbc_under_tuned.best_score_))
Best parameters are {'subsample': 0.9, 'n_estimators': 250, 'max_features': 0.8} with CV score=0.959010989010989:
# Score the tuned gradient boosting model on the under-sampled train set
gbc_random_train_under = model_performance_classification_sklearn(gbc_under_tuned, X_train_un, y_train_un)
print("Training performance:")
gbc_random_train_under
Training performance:
| Accuracy | Recall | Precision | F1 | |
|---|---|---|---|---|
| 0 | 0.997439 | 1.0 | 0.994903 | 0.997445 |
# Score the tuned gradient boosting model on the held-out validation set
gbc_random_val_under = model_performance_classification_sklearn(gbc_under_tuned, X_val, y_val)
print("Validation performance:")
gbc_random_val_under
Validation performance:
| Accuracy | Recall | Precision | F1 | |
|---|---|---|---|---|
| 0 | 0.947187 | 0.972393 | 0.763855 | 0.855601 |
# Plot the validation-set confusion matrix for the tuned gradient boosting model
confusion_matrix_sklearn(gbc_under_tuned, X_val, y_val)
# building model with best parameters found by the randomized search.
# FIX: random_state=1 added — with subsample < 1 GradientBoostingClassifier
# uses stochastic row subsampling, and the search that chose these parameters
# ran with random_state=1, so the seed is pinned to make the final model
# reproducible across runs.
Final_Model = GradientBoostingClassifier(
    subsample=0.9,
    n_estimators=250,
    max_features=0.8,
    random_state=1,
)

# Fit the model on the under-sampled training data
Final_Model.fit(X_train_un, y_train_un)
GradientBoostingClassifier(max_features=0.8, n_estimators=250, subsample=0.9)
# Score the final model on the under-sampled train set.
# FIX: removed a duplicated one-line copy of this cell that evaluated the
# model a second time and printed "Training performance:" redundantly.
Final_Model_Train = model_performance_classification_sklearn(
    Final_Model, X_train_un, y_train_un
)
print("Train performance:")
Final_Model_Train
Train performance:
| Accuracy | Recall | Precision | F1 | |
|---|---|---|---|---|
| 0 | 0.997439 | 1.0 | 0.994903 | 0.997445 |
# Score the final model on the held-out validation set
Final_Model_Val = model_performance_classification_sklearn(Final_Model, X_val, y_val)
print("Validation performance:")
Final_Model_Val
Validation performance:
| Accuracy | Recall | Precision | F1 | |
|---|---|---|---|---|
| 0 | 0.949161 | 0.969325 | 0.772616 | 0.859864 |
# Finally, we can measure performance on the test data
Final_Model_Test = model_performance_classification_sklearn(Final_Model, X_test, y_test)
print("Test performance:")
Final_Model_Test
Test performance:
| Accuracy | Recall | Precision | F1 | |
|---|---|---|---|---|
| 0 | 0.945212 | 0.966154 | 0.758454 | 0.849797 |
# Horizontal bar chart of the final model's feature importances,
# sorted so the most influential feature appears at the top.
feature_names = X_test.columns
importances = Final_Model.feature_importances_
order = np.argsort(importances)
plt.figure(figsize=(12, 12))
plt.title("Feature Importances")
plt.barh(range(len(order)), importances[order], color="violet", align="center")
plt.yticks(range(len(order)), [feature_names[i] for i in order])
plt.xlabel("Relative Importance")
plt.show()
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder

# Columns treated as categorical: impute the most frequent value first,
# then one-hot encode (categories unseen at fit time are ignored at predict).
categorical_features = [
    "Gender",
    "Education_Level",
    "Marital_Status",
    "Income_Category",
    "Card_Category",
]
categorical_transformer = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("onehot", OneHotEncoder(handle_unknown="ignore")),
    ]
)

# Every remaining column passes through the transformer untouched
preprocessor = ColumnTransformer(
    transformers=[("cat", categorical_transformer, categorical_features)],
    remainder="passthrough",
)
# Target is Attrition_Flag; every other column of df becomes a predictor.
# NOTE(review): any identifier column (e.g. CLIENTNUM) still present in df at
# this point will pass through the ColumnTransformer as a feature — verify it
# was dropped earlier in the notebook (the printed shape shows 19 columns,
# so some columns were removed upstream of this view).
X = df.drop(columns="Attrition_Flag")
Y = df["Attrition_Flag"]
# We already know the best model, so we don't need a validation set
X_train, X_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.30, random_state=1, stratify=Y
)
print(X_train.shape, X_test.shape)
(7088, 19) (3039, 19)
# Under-sampling with Random Under Sampler: balance the classes by randomly
# discarding majority-class rows (seeded so the resample is reproducible).
rus = RandomUnderSampler(random_state=1)
X_train_un, y_train_un = rus.fit_resample(X_train, y_train)
# Creating new pipeline with best parameters: preprocessing followed by the
# tuned gradient boosting classifier.
# FIX: random_state=1 added — subsample=0.9 makes the booster stochastic, so
# the seed is pinned (matching the tuning run) to keep the deployed pipeline
# reproducible.
pipe = Pipeline(
    steps=[
        ("pre", preprocessor),
        (
            "GBC",
            GradientBoostingClassifier(
                subsample=0.9,
                n_estimators=250,
                max_features=0.8,
                random_state=1,
            ),
        ),
    ]
)
# Fit the model on the under-sampled training data
pipe.fit(X_train_un, y_train_un)
Pipeline(steps=[('pre',
ColumnTransformer(remainder='passthrough',
transformers=[('cat',
Pipeline(steps=[('imputer',
SimpleImputer(strategy='most_frequent')),
('onehot',
OneHotEncoder(handle_unknown='ignore'))]),
['Gender', 'Education_Level',
'Marital_Status',
'Income_Category',
'Card_Category'])])),
('GBC',
GradientBoostingClassifier(max_features=0.8, n_estimators=250,
subsample=0.9))])
# Score the full preprocessing + GBC pipeline on the held-out test set
Pipe_Test = model_performance_classification_sklearn(pipe, X_test, y_test)
print("Test performance:")
Pipe_Test
Test performance:
| Accuracy | Recall | Precision | F1 | |
|---|---|---|---|---|
| 0 | 0.952287 | 0.969262 | 0.784411 | 0.867094 |
# Plot the test-set confusion matrix for the final pipeline
confusion_matrix_sklearn(pipe, X_test, y_test)